Data Extraction for preselected commodities portfolio

Author

Rodrigo Hermont Ozon, Érick Oliveira Rodrigues

Published

October 7, 2024

Abstract

This small document have the goal to share the time series extraction and the two basic features building, like price returns and their conditional variance…



Intro

[… to be written …]


Python codes

Code

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from arch import arch_model
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotnine import ggplot, aes, geom_line, facet_wrap, labs, theme, element_text, theme_minimal

The portfolio contains the following commodities price returns:

  • Corn Futures
  • Wheat Futures
  • KC HRW Wheat Futures
  • Rough Rice Futures
  • Feeder Cattle Futures
  • SoyMeal Futures
  • Soy Meal Futures
  • SoyBeans Futures
Code
# Tickers for portfolio
TICKERS = [
    "ZC=F",  # Corn Futures
    "ZO=F",  # Wheat Futures
    "KE=F",  # KC HRW Wheat Futures
    "ZR=F",  # Rough Rice Futures
    "GF=F",  # Feeder Cattle Futures
    "ZS=F",  # SoyMeal Futures
    "ZM=F",  # Soybean Meal Futures
    "ZL=F"   # SoyBeans Futures
]


# Downloading data from Yahoo Finance
portfolio_prices = yf.download(TICKERS, start="2019-01-01")['Adj Close']

[                       0%                       ]
[************          25%                       ]  2 of 8 completed
[******************    38%                       ]  3 of 8 completed
[**********************50%                       ]  4 of 8 completed
[**********************62%*****                  ]  5 of 8 completed
[**********************75%***********            ]  6 of 8 completed
[**********************88%*****************      ]  7 of 8 completed
[*********************100%***********************]  8 of 8 completed
Code
portfolio_prices.dropna(inplace=True)

# Renaming columns for better readability
portfolio_prices.columns = [
    "corn_fut",
    "wheat_fut",
    "KCWheat_fut",
    "rice_fut",
    "Feeder_Cattle",
    "soymeal_fut",
    "soyF_fut",
    "soybeans_fut"
]

Showing the prices time series side by side: (data in level)

Code
from plotnine import ggplot, aes, geom_line, facet_wrap, labs, theme, element_text, theme_minimal, theme_void

# Preparar os dados no formato long (necessário para plotnine/ggplot2)
portfolio_prices_long = portfolio_prices.reset_index().melt(id_vars='Date', var_name='Commodity', value_name='Price')

def plot_with_ggplot(data, title, ylabel, background='white', fig_height=10, fig_width=10):
    # Cria o gráfico usando plotnine (ggplot)
    p = (ggplot(data, aes(x='Date', y='Price', color='Commodity')) +
         geom_line() +
         facet_wrap('~Commodity', ncol=1, scales='free_y') +  # Um gráfico em cima do outro
         labs(title=title, x='Date', y=ylabel) +
         theme_minimal() +  # Define o tema minimalista com fundo branco
         theme(
             figure_size=(fig_width, fig_height),  # Ajuste da altura e largura da figura
             panel_background=element_text(fill=background),
             plot_background=element_text(fill=background),
             axis_text_x=element_text(rotation=45, hjust=1),
             subplots_adjust={'wspace': 0.25, 'hspace': 0.5}  # Ajuste do espaçamento entre os gráficos
         ))
    return p

p_prices = plot_with_ggplot(portfolio_prices_long, 'Commodity Prices Over Time', 'Price', background='white', fig_height=14, fig_width=8)

p_prices
<string>:2: FutureWarning: Using repr(plot) to draw and show the plot figure is deprecated and will be removed in a future version. Use plot.show().
C:\Users\Rodrigo\AppData\Local\Programs\Python\PYTHON~1\Lib\site-packages\plotnine\themes\themeable.py:2419: FutureWarning: You no longer need to use subplots_adjust to make space for the legend or text around the panels. This paramater will be removed in a future version. You can still use 'plot_margin' 'panel_spacing' for your other spacing needs.
<Figure Size: (800 x 1400)>

Obtain the returns time series (first feature):

\[ \mbox{Price log returns}_t = ln(p_t) - ln(p_{t-1}) \]

Code

# Calculate log returns
portfolio_log_returns = np.log(portfolio_prices / portfolio_prices.shift(1)).dropna()
portfolio_log_returns.columns = [
    "ret_corn_fut",
    "ret_wheat_fut",
    "ret_KCWheat_fut",
    "ret_rice_fut",
    "ret_Feeder_Cattle",
    "ret_soymeal_fut",
    "ret_soyF_fut",
    "ret_soybeans_fut"
]

And plot it:

Code
# Preparar os dados no formato long para os log-retornos
portfolio_log_returns_long = portfolio_log_returns.reset_index().melt(id_vars='Date', var_name='Commodity', value_name='Log Return')

def plot_log_returns_with_ggplot(data, title, ylabel, background='white', fig_height=10, fig_width=10):
    # Cria o gráfico usando plotnine (ggplot)
    p = (ggplot(data, aes(x='Date', y='Log Return', color='Commodity')) +
         geom_line() +
         facet_wrap('~Commodity', ncol=1, scales='free_y') +  # Um gráfico em cima do outro
         labs(title=title, x='Date', y=ylabel) +
         theme_minimal() +  # Define o tema minimalista com fundo branco
         theme(
             figure_size=(fig_width, fig_height),  # Ajuste da altura e largura da figura
             panel_background=element_text(fill=background),
             plot_background=element_text(fill=background),
             axis_text_x=element_text(rotation=45, hjust=1),
             subplots_adjust={'wspace': 0.25, 'hspace': 0.5}  # Ajuste do espaçamento entre os gráficos
         ))
    return p

p_log_returns = plot_log_returns_with_ggplot(portfolio_log_returns_long, 'Log Returns of Commodities Over Time', 'Log Return', background='white', fig_height=12, fig_width=8)

# Exibir o gráfico
p_log_returns
<string>:3: FutureWarning: Using repr(plot) to draw and show the plot figure is deprecated and will be removed in a future version. Use plot.show().
C:\Users\Rodrigo\AppData\Local\Programs\Python\PYTHON~1\Lib\site-packages\plotnine\themes\themeable.py:2419: FutureWarning: You no longer need to use subplots_adjust to make space for the legend or text around the panels. This paramater will be removed in a future version. You can still use 'plot_margin' 'panel_spacing' for your other spacing needs.
<Figure Size: (800 x 1200)>

As risk measure, we use the conditional variances (volatilities), to deal better with day by day of the prices log-returns.

The GARCH(1,1) model with an asymmetric Student-t distribution is not directly available in most Python libraries. However, we can still use a GARCH(1,1) model with a standard Student-t distribution to estimate the conditional variance. The GARCH(1,1) model is represented as follows:

\[ r_t = \mu + \epsilon_t \]

\[ \epsilon_t = \sigma_t z_t, \quad z_t \sim t_{\nu}(0, 1) \]

\[ \sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2 + \beta \sigma_{t-1}^2 \]

Where:

  • \(r_t\) is the log-return at time \(t\).
  • \(\mu\) is the mean of the returns.
  • \(\epsilon_t\) is the error term, modeled as conditional on past information.
  • \(\sigma_t^2\) is the conditional variance at time \(t\).
  • \(\omega, \alpha, \beta\) are the parameters to be estimated, with \(\omega > 0, \alpha \geq 0, \beta \geq 0\).
  • \(z_t\) follows a Student-t distribution with \(\nu\) degrees of freedom to capture the heavy tails observed in financial returns.
Code
# Initialize an empty DataFrame to store conditional variances
cond_variances = pd.DataFrame(index=portfolio_log_returns.index, columns=portfolio_log_returns.columns)

# Loop through each commodity's log-returns and fit a GARCH(1,1) model
for col in portfolio_log_returns.columns:
    # Fit a GARCH(1,1) model with a Student-t distribution for each series of log returns
    model = arch_model(portfolio_log_returns[col], vol='Garch', p=1, q=1, dist='t')
    res = model.fit(disp='off')
    
    # Extract conditional variances and store them in the DataFrame
    cond_variances[col] = res.conditional_volatility

# Show the first few rows of the conditional variances DataFrame
cond_variances.head()
                           ret_corn_fut  ...  ret_soybeans_fut
Date                                     ...                  
2019-01-03 00:00:00+00:00      0.039012  ...          0.013789
2019-01-04 00:00:00+00:00      0.054648  ...          0.015230
2019-01-07 00:00:00+00:00      0.066715  ...          0.016926
2019-01-08 00:00:00+00:00      0.076912  ...          0.016091
2019-01-09 00:00:00+00:00      0.085906  ...          0.016364

[5 rows x 8 columns]

and visualizing them:

Code
# Preparar os dados no formato long para as variâncias condicionais
cond_variances_long = cond_variances.reset_index().melt(id_vars='Date', var_name='Commodity', value_name='Conditional Variance')

# Função para criar o gráfico com fundo branco ou transparente e ajustar o tamanho da figura
def plot_cond_variances_with_ggplot(data, title, ylabel, background='white', fig_height=10, fig_width=10):
    # Cria o gráfico usando plotnine (ggplot)
    p = (ggplot(data, aes(x='Date', y='Conditional Variance', color='Commodity')) +
         geom_line() +
         facet_wrap('~Commodity', ncol=1, scales='free_y') +  # Um gráfico em cima do outro
         labs(title=title, x='Date', y=ylabel) +
         theme_minimal() +  # Define o tema minimalista com fundo branco
         theme(
             figure_size=(fig_width, fig_height),  # Ajuste da altura e largura da figura
             panel_background=element_text(fill=background),
             plot_background=element_text(fill=background),
             axis_text_x=element_text(rotation=45, hjust=1),
             subplots_adjust={'wspace': 0.25, 'hspace': 0.5}  # Ajuste do espaçamento entre os gráficos
         ))
    return p

# Exemplo de uso para as variâncias condicionais das commodities
p_cond_variances = plot_cond_variances_with_ggplot(cond_variances_long, 'Conditional Variances Over Time (GARCH(1,1))', 'Conditional Variance', background='white', fig_height=12, fig_width=8)

p_cond_variances
<string>:2: FutureWarning: Using repr(plot) to draw and show the plot figure is deprecated and will be removed in a future version. Use plot.show().
C:\Users\Rodrigo\AppData\Local\Programs\Python\PYTHON~1\Lib\site-packages\plotnine\themes\themeable.py:2419: FutureWarning: You no longer need to use subplots_adjust to make space for the legend or text around the panels. This paramater will be removed in a future version. You can still use 'plot_margin' 'panel_spacing' for your other spacing needs.
<Figure Size: (800 x 1200)>


R codes

Code
library(tidyverse)
library(dplyr)
library(ggplot2)
#library(plotly)
library(rugarch)
library(timeSeries)
library(fPortfolio)
library(quantmod)
library(caTools)
library(PerformanceAnalytics)
library(MASS)
library(PortfolioAnalytics)
library(ROI)
require(ROI.plugin.glpk)
require(ROI.plugin.quadprog)
library(quadprog)
library(corpcor)
library(DEoptim)
library(cowplot) # devtools::install_github("wilkelab/cowplot/")
library(lattice)
library(timetk)

Loading time series data, for portfolio setting…

Code
tickers <- c(
         "ZC=F", # Corn Futures
         "ZO=F", # Wheat Futures
         "KE=F", # Futuros KC HRW Wheat Futures
         "ZR=F", # Rough Rice Futures
         "GF=F", # Feeder Cattle Futures
         "ZS=F", # SoyMeal Futures 
         "ZM=F", # Futuros farelo soja
         "ZL=F"  # SoyBeans Futures
)

Obtain daily prices and their returns:

Code
portfolioPrices <- NULL
  for ( Ticker in tickers )
    portfolioPrices <- cbind(
      portfolioPrices, 
      getSymbols.yahoo(
        Ticker,
        from = "2019-01-01",
        auto.assign = FALSE
      )[,4]
    )

portfolioPrices <- portfolioPrices[apply(portfolioPrices, 1, function(x) all(!is.na(x))),]

colnames(portfolioPrices) <- c(
  "corn_fut",
  "wheat_fut",
  "KCWheat_fut",
  "rice_fut",
  "Feeder_Cattle",
  "soymeal_fut",
  "soyF_fut",
  "soybeans_fut"
)

tail(portfolioPrices)
           corn_fut wheat_fut KCWheat_fut rice_fut Feeder_Cattle soymeal_fut
2024-09-30   424.75    392.50      583.75   1529.5       246.200     1057.00
2024-10-01   429.00    388.00      598.25   1530.0       246.150     1057.25
2024-10-02   432.50    389.75      619.25   1516.5       249.725     1056.00
2024-10-03   428.25    383.75      611.50   1517.0       248.975     1046.00
2024-10-04   424.75    388.25      598.00   1509.5       249.625     1037.75
2024-10-07   426.00    395.75      603.25   1511.5       248.850     1034.00
           soyF_fut soybeans_fut
2024-09-30    344.3        43.51
2024-10-01    350.0        42.91
2024-10-02    341.4        43.72
2024-10-03    332.8        44.69
2024-10-04    330.5        44.04
2024-10-07    324.8        44.56

Plotting the time series prices (in level):

Code
portfolioPrices |> as.data.frame() |>
  mutate(
    time = seq_along( corn_fut )
  ) |>
  pivot_longer(
    !time,
    names_to = "Variables",
    values_to = "Value"  
      ) |>
  group_by(Variables) |>
  plot_time_series(
    time,
    Value,
    .interactive = F, # Change for TRUE for better visualization
    .facet_ncol = 2,
    .smooth = FALSE
  ) +
  theme(
    strip.background = element_rect(fill = "white", colour = "white")
  )

Obtain the returns time series (first feature):

\[ \mbox{Price log returns}_t = ln(p_t) - ln(p_{t-1}) \]

Code
# Calculate log returns for the portfolio prices
portfolioReturs <- na.omit(diff(log(portfolioPrices))) |> as.data.frame()

colnames(portfolioReturs) <- c(
  "ret_corn_fut",
  "ret_wheat_fut",
  "ret_KCWheat_fut",
  "ret_rice_fut",
  "ret_Feeder_Cattle",
  "ret_soymeal_fut",
  "ret_soyF_fut",
  "ret_soybeans_fut"
)

glimpse(portfolioReturs)
Rows: 1,450
Columns: 8
$ ret_corn_fut      <dbl> 0.0105891128, 0.0085218477, -0.0019601444, -0.005903…
$ ret_wheat_fut     <dbl> 0.0008980692, 0.0053715438, -0.0017873106, 0.0115608…
$ ret_KCWheat_fut   <dbl> 0.0220892515, 0.0049529571, -0.0059464992, 0.0039682…
$ ret_rice_fut      <dbl> 0.0083930387, 0.0053934919, 0.0174507579, 0.00813596…
$ ret_Feeder_Cattle <dbl> -0.0096783375, -0.0111522135, 0.0075628145, 0.011068…
$ ret_soymeal_fut   <dbl> 0.0061281529, 0.0102224954, 0.0030190774, -0.0065988…
$ ret_soyF_fut      <dbl> 0.0054513913, 0.0076457646, 0.0097900861, -0.0018874…
$ ret_soybeans_fut  <dbl> 0.0099858421, 0.0081286732, -0.0052938051, -0.002834…
Code
#portfolioReturs <- as.timeSeries(portfolioReturs)

Plot all time series and their returns:

Code
portfolioReturs |> 
  mutate(
    time = seq_along( ret_corn_fut )
  ) |>
  pivot_longer(
    !time,
    names_to = "Variables",
    values_to = "Value"  
      ) |>
  group_by(Variables) |>
  plot_time_series(
    time,
    Value,
    .interactive = F, # Change for TRUE for better visualization
    .facet_ncol = 2,
    .smooth = FALSE
  ) +
  theme(
    strip.background = element_rect(fill = "white", colour = "white")
  )

Plotting the histograms:

Code
portfolioPrices_df <- as_tibble(portfolioPrices, rownames = "date")
portfolioPrices_df$date <- ymd(portfolioPrices_df$date)

portfolioReturs_df <- na.omit( ROC( portfolioPrices ), type = "discrete" ) |>
  as_tibble(rownames = "date")
portfolioReturs_df$date <- ymd(portfolioReturs_df$date)
colnames(portfolioReturs_df) <- c(
  "date",
  "ret_corn_fut",
  "ret_wheat_fut",
  "ret_KCWheat_fut",
  "ret_rice_fut",
  "ret_Feeder_Cattle",
  "ret_soymeal_fut",
  "ret_soyF_fut",
  "ret_soybeans_fut"
)

# Remover a coluna com nome NA
portfolioReturs_df <- portfolioReturs_df[, !is.na(colnames(portfolioReturs_df))]

# Verificar novamente os nomes das colunas para garantir que estão corretos
colnames(portfolioReturs_df)
[1] "date"              "ret_corn_fut"      "ret_wheat_fut"    
[4] "ret_KCWheat_fut"   "ret_rice_fut"      "ret_Feeder_Cattle"
[7] "ret_soymeal_fut"   "ret_soyF_fut"      "ret_soybeans_fut" 
Code
portfolioReturs_long <- portfolioReturs_df |> 
  pivot_longer(
    cols = -date, # Exclui a coluna de data
    names_to = "fut_type", 
    values_to = "returns"
  )

ggplot(portfolioReturs_long, aes(x = returns)) + 
  geom_histogram(aes(y = ..density..), binwidth = .01, color = "black", fill = "white") +
  geom_density(alpha = .2, fill="lightgray") +
  theme_minimal() +
  theme(
    axis.line  = element_line(colour = "black"),
    axis.text  = element_text(colour = "black"),  
    axis.ticks = element_line(colour = "black"), 
    legend.position = c(.1,.9), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank()
  ) +
  theme(plot.title   = element_text(size = 10),  
        axis.title.x = element_text(size = 7), 
        axis.title.y = element_text(size = 7)) + 
  labs(x = "Returns", y = "Density") +
  facet_wrap(~fut_type, scales = "free", ncol = 2) 

And finnaly, the last feature, is called, the conditional variance (risk measure), obtained by GARCH(1,1) model, formalized as:

The GARCH(1,1) model with asymmetric Student-t distribution can be represented mathematically as:

\[ r_t = \mu + \epsilon_t \]

\[ \epsilon_t = \sigma_t z_t, \quad z_t \sim t_{\nu}(0, 1) \]

\[ \sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2 + \beta \sigma_{t-1}^2 \]

Where:

  • \(r_t\) is the return at time \(t\).
  • \(\mu\) is the mean of the returns.
  • \(\epsilon_t\) is the error term, modeled as conditional on past information.
  • \(\sigma_t^2\) is the conditional variance at time \(t\).
  • \(\omega, \alpha, \beta\) are the parameters to be estimated, with \(\omega > 0, \alpha \geq 0, \beta \geq 0\).
  • \(z_t\) follows an asymmetric Student-t distribution with \(\nu\) degrees of freedom to better capture the heavy tails and skewness observed in financial returns.
Code
# Load necessary packages
library(rugarch)

# Define the GARCH(1,1) model specification with Student-t distribution
spec <- ugarchspec(
  variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
  mean.model = list(armaOrder = c(0, 0), include.mean = TRUE),
  distribution.model = "std" # Using Student-t distribution
)

# Estimate the model for each asset in the portfolio and extract conditional variances
garch_models <- list()
conditional_variances <- list()

for (i in colnames(portfolioReturs)) {
  garch_models[[i]] <- ugarchfit(spec, data = portfolioReturs[[i]])
  conditional_variances[[i]] <- sigma(garch_models[[i]])^2
}

# Convert conditional variances list to a data frame
conditional_variances_df <- do.call(cbind, conditional_variances) %>%
  as.data.frame() %>%
  mutate(time = seq_along(conditional_variances[[1]]))

colnames(conditional_variances_df) <- c(
  "cond_var_corn_fut",
  "cond_var_wheat_fut",
  "cond_var_KCWheat_fut",
  "cond_var_rice_fut",
  "cond_var_Feeder_Cattle",
  "cond_var_soymeal_fut",
  "cond_var_soyF_fut",
  "cond_var_soybeans_fut",
  "time"
)

# Reshape data for plotting
conditional_variances_long <- conditional_variances_df %>%
  pivot_longer(!time, names_to = "Variables", values_to = "Value")

And the plot of the conditional variance (risk):

Code
conditional_variances_long |> 
  group_by(Variables) |>
  plot_time_series(
    time,
    Value,
    .interactive = F, # Change for TRUE for better visualization
    .facet_ncol = 2,
    .smooth = FALSE
  ) +
  theme(
    strip.background = element_rect(fill = "white", colour = "white")
  )

 

 


References

Gujarati, D., N. (2004) Basic Econometrics, fourth edition, The McGraw−Hill Companies

Hair, J. F., Black, W. C., Babin, B. J., & Anderson, R. E. (2019). Multivariate Data Analysis. Pearson.

Hyndman, R.J., & Athanasopoulos, G. (2021) Forecasting: principles and practice, 3rd edition, OTexts: Melbourne, Australia. OTexts.com/fpp3. Accessed on oct 2023.

 

 


Code
# Total timing to compile this Quarto document

end_time = datetime.now()
time_diff = end_time - start_time

print(f"Total Quarto document compiling time: {time_diff}")
Total Quarto document compiling time: 0:02:13.954388